started: Alexey Larionov, 01Mar2016
last updated: Alexey Larionov, 23Apr2019

Summary

Remove genotypes (set to NA) if:
1) genotypes gq < 20 (~12%)
2) genotypes dp < 10 (~18%, +5% after genotypes)
3) genotypes dp > 1000 (~1%)
4) Alt fraction

Remove variants with
1) call_rate < 0.85
(in each: nfe and wecare after the genotypes filtering)
this step removes ~73% of all variants !!!
2) uniform genotypes accross all samples
(created by the above filtering)

In addition
1) rename gt.mx to gt.mx
2) remove unnecessary matrices (gt_num, meta, gq, ref, alt); keep dp and gt_chr for later use

gq 20 filter is set arbitrary; however, it is consistent with what some othersis do
(e.g. see Carson BMC Bioinformatics. 2014 15:125).

A small number of genotypes (~1%) was covered too high to be true (above 1000 coverage).
These are obvious mistakes, and they have been removed too. Arbitrarily the threshold for
max DP was set to 1000 (appr. 10 fold of average coverage).

gt NA rate before filtering ~5% (~3% in wecare and ~12% in NFE respectively)
gt NA rate after filtering ~4% (~3% in wecare and ~6% in NFE respectively)

Input data: 8,275 vars x 739 cases (541 wecare + 198 nfe)
Output data: 1.886 vars x 739 cases (541 wecare + 198 nfe)

start_section

# Time stamp
Sys.time()
## [1] "2019-04-23 15:00:36 BST"
# Clenan-up
rm(list=ls())
graphics.off()

# Base folder
library(knitr)
base_folder="/Users/alexey/Documents/wecare/ampliseq/v04_ampliseq_nfe/s07_filter_genotypes_and_variants"
opts_knit$set(root.dir = base_folder)
options(stringsAsFactors = F,
        warnPartialMatchArgs = T, 
        warnPartialMatchAttr = T, 
        warnPartialMatchDollar = T)
#setwd(base_folder)

# Thresholds for genotypes
min.gq <- 20
min.dp <- 10
max.dp <- 1000
min.call.rate <- 0.85

# add parameters to filter by Alt fraction here?

load_data

load(paste(base_folder, "r01_filter_variants.RData", sep="/"))

gt.mx <- gt_add.mx
rm(gt_add.mx, gt_num.mx, meta.df) # keep gt_chr.mx for TsTv ratio check later

check_data

# List objects
ls()
##  [1] "alt.mx"        "base_folder"   "dp.mx"         "fixed.df"      "gq.mx"         "gt_chr.mx"     "gt.mx"         "max.dp"        "min.call.rate" "min.dp"        "min.gq"        "ref.mx"
# Check sizes
dim(gt.mx)
## [1] 8275  739
dim(ref.mx)
## [1] 8275  739
dim(alt.mx)
## [1] 8275  739
dim(gq.mx)
## [1] 8275  739
dim(dp.mx)
## [1] 8275  739
dim(fixed.df)
## [1] 8275  110
colnames(fixed.df)
##   [1] "CHROM"                   "POS"                     "ID"                      "REF"                     "ALT"                     "QUAL"                    "FILTER"                  "init_AC"                 "init_AF"                 "init_AN"                 "AS_FilterStatus"         "AS_VQSLOD"               "AS_culprit"              "BaseQRankSum"            "ClippingRankSum"         "DB"                      "DP"                      "DS"                      "END"                     "ExcessHet"               "FS"                      "HaplotypeScore"          "InbreedingCoeff"         "MLEAC"                   "MLEAF"                   "MQ"                      "MQRankSum"               "NDA"                     "NEGATIVE_TRAIN_SITE"     "POSITIVE_TRAIN_SITE"     "QD"                      "ReadPosRankSum"          "SOR"                     "SplitVarID"              "VQSLOD"                  "culprit"                 "exac_non_TCGA.AC"        "exac_non_TCGA.AC_AFR"   
##  [39] "exac_non_TCGA.AC_AMR"    "exac_non_TCGA.AC_Adj"    "exac_non_TCGA.AC_EAS"    "exac_non_TCGA.AC_FEMALE" "exac_non_TCGA.AC_FIN"    "exac_non_TCGA.AC_Hemi"   "exac_non_TCGA.AC_Het"    "exac_non_TCGA.AC_Hom"    "exac_non_TCGA.AC_MALE"   "exac_non_TCGA.AC_NFE"    "exac_non_TCGA.AC_OTH"    "exac_non_TCGA.AC_SAS"    "exac_non_TCGA.AF"        "exac_non_TCGA.AN"        "exac_non_TCGA.AN_AFR"    "exac_non_TCGA.AN_AMR"    "exac_non_TCGA.AN_Adj"    "exac_non_TCGA.AN_EAS"    "exac_non_TCGA.AN_FEMALE" "exac_non_TCGA.AN_FIN"    "exac_non_TCGA.AN_MALE"   "exac_non_TCGA.AN_NFE"    "exac_non_TCGA.AN_OTH"    "exac_non_TCGA.AN_SAS"    "exac_non_TCGA.Hemi_AFR"  "exac_non_TCGA.Hemi_AMR"  "exac_non_TCGA.Hemi_EAS"  "exac_non_TCGA.Hemi_FIN"  "exac_non_TCGA.Hemi_NFE"  "exac_non_TCGA.Hemi_OTH"  "exac_non_TCGA.Hemi_SAS"  "exac_non_TCGA.Het_AFR"   "exac_non_TCGA.Het_AMR"   "exac_non_TCGA.Het_EAS"   "exac_non_TCGA.Het_FIN"   "exac_non_TCGA.Het_NFE"   "exac_non_TCGA.Het_OTH"   "exac_non_TCGA.Het_SAS"  
##  [77] "exac_non_TCGA.Hom_AFR"   "exac_non_TCGA.Hom_AMR"   "exac_non_TCGA.Hom_EAS"   "exac_non_TCGA.Hom_FIN"   "exac_non_TCGA.Hom_NFE"   "exac_non_TCGA.Hom_OTH"   "exac_non_TCGA.Hom_SAS"   "kgen.AC"                 "kgen.AF"                 "kgen.AFR_AF"             "kgen.AMR_AF"             "kgen.AN"                 "kgen.EAS_AF"             "kgen.EUR_AF"             "kgen.SAS_AF"             "SYMBOL"                  "Allele"                  "Existing_variation"      "Consequence"             "IMPACT"                  "CLIN_SIG"                "cDNA_position"           "CDS_position"            "Codons"                  "Protein_position"        "Amino_acids"             "DISTANCE"                "STRAND"                  "SYMBOL_SOURCE"           "Multiallelic"            "SIFT_call"               "SIFT_score"              "PolyPhen_call"           "PolyPhen_score"
# Check consistence of rownames and colnames

sum(rownames(gt.mx) != rownames(gq.mx))
## [1] 0
sum(rownames(gt.mx) != rownames(dp.mx))
## [1] 0
sum(rownames(gt.mx) != rownames(ref.mx))
## [1] 0
sum(rownames(gt.mx) != rownames(alt.mx))
## [1] 0
sum(colnames(gt.mx) != colnames(gq.mx))
## [1] 0
sum(colnames(gt.mx) != colnames(dp.mx))
## [1] 0
sum(colnames(gt.mx) != colnames(ref.mx))
## [1] 0
sum(colnames(gt.mx) != colnames(alt.mx))
## [1] 0
sum(rownames(gt.mx) != rownames(fixed.df))
## [1] 0

explore_data_before_filtering

Genotypes NA rates
Histogram of call rates per variant
Histograms of gq and dp in non-NA genotypes

gt.mx[1:3,c(1,2,541,542,738,739)]
##              100_S8_L007 101_S528_L008 9_S346_L008 HG00097 NA20828 NA20832
## Var000000001           0             0           0       0       0       0
## Var000000002           0             0           0       0       0       0
## Var000000003           0             0           0       0       0       0
# Prepare matrices with sub-groups data
gt_nfe.mx <- gt.mx[,542:739]
gq_nfe.mx <- gq.mx[,542:739]
dp_nfe.mx <- dp.mx[,542:739]
gt_wecare.mx <- gt.mx[,1:541]
gq_wecare.mx <- gq.mx[,1:541]
dp_wecare.mx <- dp.mx[,1:541]

gt_nfe.mx[1:5,1:5]
##              HG00097 HG00099 HG00100 HG00102 HG00106
## Var000000001       0       0       0       0       0
## Var000000002       0       0       0       0       0
## Var000000003       0       0       0       1       0
## Var000000004       1       2       1       1       2
## Var000000005       1       2       1       1       2
gt_wecare.mx[1:5,1:5]
##              100_S8_L007 101_S528_L008 102_S466_L008 103_S147_L007 104_S325_L008
## Var000000001           0             0             0             0             0
## Var000000002           0             0             0             0             0
## Var000000003           0             0             0             0             0
## Var000000004           0             2             2             2             1
## Var000000005           0             2             2             2             1
# --- Fractions of NA genotypes before filtering --- #

sum(is.na(gt.mx))/(nrow(gt.mx)*ncol(gt.mx)) # ~5%
## [1] 0.05399147
sum(is.na(gt_nfe.mx))/(nrow(gt_nfe.mx)*ncol(gt_nfe.mx)) # ~12%
## [1] 0.1254826
sum(is.na(gt_wecare.mx))/(nrow(gt_wecare.mx)*ncol(gt_wecare.mx)) # ~3%
## [1] 0.0278265
# --- Call rates per variant before filtering --- #

# All samples
x <- ncol(gt.mx)
y <- apply(gt.mx,1,function(z){1-sum(is.na(z))/x})
hist(y, breaks=50, main="Call rates per variant before filtering\n739 wecare-nfe samples", xlab="Call rates")

# nfe
x <- ncol(gt_nfe.mx)
y <- apply(gt_nfe.mx,1,function(z){1-sum(is.na(z))/x})
hist(y, breaks=50, main="Call rates per variant before filtering\n198 nfe samples", xlab="Call rates")

# wecare
x <- ncol(gt_wecare.mx)
y <- apply(gt_wecare.mx,1,function(z){1-sum(is.na(z))/x})
hist(y, breaks=50, main="Call rates per variant before filtering\n541 wecare samples", xlab="Call rates")

# --- gq  before filtering (when gt is not NA !) --- #

# All samples
hist(gq.mx[!is.na(gt.mx)], breaks=50, 
     main="Histogram of gq in non-NA genotypes before filtering\n739 wecare-nfe samples", xlab="gq")

hist(gq_nfe.mx[!is.na(gt_nfe.mx)], breaks=50, 
     main="Histogram of gq in non-NA genotypes before filtering\n198 nfe samples", xlab="gq")

hist(gq_wecare.mx[!is.na(gt_wecare.mx)], breaks=50, 
     main="Histogram of gq in non-NA genotypes before filtering\n541 wecare samples", xlab="gq")

# --- dp before filtering (when gt is not NA !) --- #

# All samples
hist(dp.mx[!is.na(gt.mx)], breaks=50, 
     main="Histogram of dp in non-NA genotypes before filtering\n739 wecare-nfe samples", xlab="dp")

hist(dp.mx[!is.na(gt.mx)], breaks=2500, xlim=c(0,100), 
     main="Histogram of dp in non-NA genotypes before filtering (zoom, 0:100)\n739 wecare-nfe samples", xlab="dp")

# nfe
hist(dp_nfe.mx[!is.na(gt_nfe.mx)], breaks=50, 
     main="Histogram of dp in non-NA genotypes before filtering\n198 nfe samples", xlab="dp")

hist(dp_nfe.mx[!is.na(gt_nfe.mx)], breaks=2500, xlim=c(0,100), 
     main="Histogram of dp in non-NA genotypes before filtering (zoom, 0:100)\n198 nfe samples", xlab="dp")

# wecare
hist(dp_wecare.mx[!is.na(gt_wecare.mx)], breaks=50, 
     main="Histogram of dp in non-NA genotypes before filtering\n541 wecare samples", xlab="dp")

hist(dp_wecare.mx[!is.na(gt_wecare.mx)], breaks=2500, xlim=c(0,100), 
     main="Histogram of dp in non-NA genotypes before filtering (zoom, 0:100)\n541 wecare samples", xlab="dp")

# Mean GQ and DP in non-NA genotypes before filtering
quantile(gq.mx[!is.na(gt.mx)], na.rm=TRUE) # median gq ~ 99
##   0%  25%  50%  75% 100% 
##    0   45   99   99   99
quantile(dp.mx[!is.na(gt.mx)], na.rm=TRUE) # median dp ~ 35
##   0%  25%  50%  75% 100% 
##    0   16   35   53 9901
mean(dp.mx[!is.na(gt.mx)], na.rm=TRUE) # mean dp ~89
## [1] 89.25717
# crude estimates for proportions of genotypes removed by filters
sum(dp.mx > max.dp, na.rm = T) / sum(!is.na(dp.mx))
## [1] 0.0122493
sum(dp.mx < min.dp, na.rm = T) / sum(!is.na(dp.mx))
## [1] 0.1797405
sum(gq.mx < min.gq, na.rm = T) / sum(!is.na(gq.mx))
## [1] 0.1190502
# Clean-up
rm(x,y, gt_nfe.mx, gq_nfe.mx, dp_nfe.mx, gt_wecare.mx, gq_wecare.mx, dp_wecare.mx)

filter_out_low_gq

Put NA to genotypes where gq < 20 : removes ~12% of non-NA genotypes

# num of genotypes to be removed (in !NA gt)
format(sum(gq.mx[!is.na(gt.mx)] < min.gq, na.rm=TRUE), big.mark=",") # ~687K
## [1] "686,697"
# Fraction of non-NA genotypes to be removed (in !NA gt)
sum(gq.mx[!is.na(gt.mx)] < min.gq, na.rm=TRUE)/sum(!is.na(gt.mx)) # ~12%
## [1] 0.1187019
# Apply filter (to gt only !)
NA -> gt.mx[ gq.mx < min.gq ]
NA -> gt_chr.mx[ gq.mx < min.gq ]

# Clean up
rm(min.gq)

explore_data_after_gq_filtering

# Prepare matrices with sub-groups data
gt_nfe.mx <- gt.mx[,542:739]
gq_nfe.mx <- gq.mx[,542:739]
dp_nfe.mx <- dp.mx[,542:739]
gt_wecare.mx <- gt.mx[,1:541]
gq_wecare.mx <- gq.mx[,1:541]
dp_wecare.mx <- dp.mx[,1:541]

# --- Fractions of NA genotypes after gq filtering --- #

sum(is.na(gt.mx))/(nrow(gt.mx)*ncol(gt.mx)) # ~17%
## [1] 0.1662845
sum(is.na(gt_nfe.mx))/(nrow(gt_nfe.mx)*ncol(gt_nfe.mx)) # ~22%
## [1] 0.225728
sum(is.na(gt_wecare.mx))/(nrow(gt_wecare.mx)*ncol(gt_wecare.mx)) # ~14%
## [1] 0.1445288
# --- Call rates per variant after gq filtering --- #

# All samples
x <- ncol(gt.mx)
y <- apply(gt.mx,1,function(z){1-sum(is.na(z))/x})
hist(y, breaks=50, main="Call rates per variant after gq filtering\n739 wecare-nfe samples", xlab="Call rates")

# nfe
x <- ncol(gt_nfe.mx)
y <- apply(gt_nfe.mx,1,function(z){1-sum(is.na(z))/x})
hist(y, breaks=50, main="Call rates per variant after gq filtering\n198 nfe samples", xlab="Call rates")

# wecare
x <- ncol(gt_wecare.mx)
y <- apply(gt_wecare.mx,1,function(z){1-sum(is.na(z))/x})
hist(y, breaks=50, main="Call rates per variant after gq filtering\n541 wecare samples", xlab="Call rates")

# --- gq  after gq filtering (when gt is not NA !) --- #

# All samples
hist(gq.mx[!is.na(gt.mx)], breaks=50, xlim=c(0,100), 
     main="Histogram of gq in non-NA genotypes after gq filtering\n739 wecare-nfe samples", xlab="gq")

hist(gq_nfe.mx[!is.na(gt_nfe.mx)], breaks=50, xlim=c(0,100), 
     main="Histogram of gq in non-NA genotypes after gq filtering\n198 nfe samples", xlab="gq")

hist(gq_wecare.mx[!is.na(gt_wecare.mx)], breaks=50, xlim=c(0,100), 
     main="Histogram of gq in non-NA genotypes after gq filtering\n541 wecare samples", xlab="gq")

# --- dp after gq filtering (when gt is not NA !) --- #

# All samples
hist(dp.mx[!is.na(gt.mx)], breaks=50, 
     main="Histogram of dp in non-NA genotypes after gq filtering\n739 wecare-nfe samples", xlab="dp")

hist(dp.mx[!is.na(gt.mx)], breaks=2500, xlim=c(0,100), 
     main="Histogram of dp in non-NA genotypes after gq filtering (zoom 0:100)\n739 wecare-nfe samples", xlab="dp")

# nfe
hist(dp_nfe.mx[!is.na(gt_nfe.mx)], breaks=50, 
     main="Histogram of dp in non-NA genotypes after gq filtering\n198 nfe samples", xlab="dp")

hist(dp_nfe.mx[!is.na(gt_nfe.mx)], breaks=2500, xlim=c(0,100), 
     main="Histogram of dp in non-NA genotypes after gq filtering (zoom 0:100)\n198 nfe samples", xlab="dp")

# wecare
hist(dp_wecare.mx[!is.na(gt_wecare.mx)], breaks=50, 
     main="Histogram of dp in non-NA genotypes after gq filtering\n541 wecare samples", xlab="dp")

hist(dp_wecare.mx[!is.na(gt_wecare.mx)], breaks=2500, xlim=c(0,100), 
     main="Histogram of dp in non-NA genotypes after gq filtering (zoom 0:100)\n541 wecare samples", xlab="dp")

# Mean GQ and DP in non-NA genotypes after gq filtering
mean(gq.mx[!is.na(gt.mx)], na.rm=TRUE) # ~ 81
## [1] 81.69107
mean(dp.mx[!is.na(gt.mx)], na.rm=TRUE) # ~ 100
## [1] 100.3238
# Clean-up
rm(x,y, gt_nfe.mx, gq_nfe.mx, dp_nfe.mx, gt_wecare.mx, gq_wecare.mx, dp_wecare.mx)

filter_out_low_dp

put NA to genotypes where dp < 10 : additionally removes ~5% of non-NA genotypes

# num of genotypes to be removed (in !NA gt after gq filtering)
format(sum(dp.mx[!is.na(gq.mx)] < min.dp, na.rm=TRUE), big.mark=",") # ~871K
## [1] "871,268"
# Fraction of genotypes to be removed (in !NA gt after gq filtering)
sum(dp.mx[!is.na(gq.mx)] < min.dp, na.rm=TRUE)/sum(!is.na(gt.mx)) # ~17%
## [1] 0.1708919
# Apply filter (to gt only !)
NA -> gt.mx[ dp.mx < min.dp ]
NA -> gt_chr.mx[ dp.mx < min.dp ]

# Clean up
rm(min.dp)

explore_data_after_gq_min-dp_filtering

# Prepare matrices with sub-groups data
gt_nfe.mx <- gt.mx[,542:739]
gq_nfe.mx <- gq.mx[,542:739]
dp_nfe.mx <- dp.mx[,542:739]
gt_wecare.mx <- gt.mx[,1:541]
gq_wecare.mx <- gq.mx[,1:541]
dp_wecare.mx <- dp.mx[,1:541]

# --- Fractions of NA genotypes after gq filtering --- #

sum(is.na(gt.mx))/(nrow(gt.mx)*ncol(gt.mx)) # ~21%
## [1] 0.2075736
sum(is.na(gt_nfe.mx))/(nrow(gt_nfe.mx)*ncol(gt_nfe.mx)) # ~25%
## [1] 0.2469261
sum(is.na(gt_wecare.mx))/(nrow(gt_wecare.mx)*ncol(gt_wecare.mx)) # ~19%
## [1] 0.193171
# --- Call rates per variant after gq-dp filtering --- #

# All samples
x <- ncol(gt.mx)
y <- apply(gt.mx,1,function(z){1-sum(is.na(z))/x})
hist(y, breaks=50, main="Call rates per variant after gq-low-dp filtering\n739 wecare-nfe samples", xlab="Call rates")

# nfe
x <- ncol(gt_nfe.mx)
y <- apply(gt_nfe.mx,1,function(z){1-sum(is.na(z))/x})
hist(y, breaks=50, main="Call rates per variant after gq-low-dp filtering\n198 nfe samples", xlab="Call rates")

# wecare
x <- ncol(gt_wecare.mx)
y <- apply(gt_wecare.mx,1,function(z){1-sum(is.na(z))/x})
hist(y, breaks=50, main="Call rates per variant after gq-low-dp filtering\n541 wecare samples", xlab="Call rates")

# --- gq  after gq-dp filtering (when gt is not NA !) --- #

# All samples
hist(gq.mx[!is.na(gt.mx)], breaks=50, xlim=c(0,100), 
     main="Histogram of gq in non-NA genotypes after gq-low-dp filtering\n739 wecare-nfe samples", xlab="gq")

hist(gq_nfe.mx[!is.na(gt_nfe.mx)], breaks=50, xlim=c(0,100), 
     main="Histogram of gq in non-NA genotypes after gq-low-dp filtering\n198 nfe samples", xlab="gq")

hist(gq_wecare.mx[!is.na(gt_wecare.mx)], breaks=50, xlim=c(0,100), 
     main="Histogram of gq in non-NA genotypes after gq-low-dp filtering\n541 wecare samples", xlab="gq")

# --- dp after gq-dp filtering (when gt is not NA !) --- #

# All samples
hist(dp.mx[!is.na(gt.mx)], breaks=50, 
     main="Histogram of dp in non-NA genotypes after gq-low-dp filtering\n739 wecare-nfe samples", xlab="dp")

hist(dp.mx[!is.na(gt.mx)], breaks=2500, xlim=c(0,100), 
     main="Histogram of dp in non-NA genotypes after gq-low-dp filtering (zoom, 0:100)\n739 wecare-nfe samples", xlab="dp")

# nfe
hist(dp_nfe.mx[!is.na(gt_nfe.mx)], breaks=50, 
     main="Histogram of dp in non-NA genotypes after gq-low-dp filtering\n198 nfe samples", xlab="dp")

hist(dp_nfe.mx[!is.na(gt_nfe.mx)], breaks=250, xlim=c(0,100), 
     main="Histogram of dp in non-NA genotypes after gq-low-dp filtering (zoom, 0:100)\n198 nfe samples", xlab="dp")

# wecare
hist(dp_wecare.mx[!is.na(gt_wecare.mx)], breaks=50, 
     main="Histogram of dp in non-NA genotypes after gq-low-dp filtering\n541 wecare samples", xlab="dp")

hist(dp_wecare.mx[!is.na(gt_wecare.mx)], breaks=2500, xlim=c(0,100), 
     main="Histogram of dp in non-NA genotypes after gq-low-dp filtering (zoom, 0:100)\n541 wecare samples", xlab="dp")

# Mean GQ and DP in non-NA genotypes after gq-dp filtering
mean(gq.mx[!is.na(gt.mx)], na.rm=TRUE) # ~ 85
## [1] 84.50231
mean(dp.mx[!is.na(gt.mx)], na.rm=TRUE) # ~ 105
## [1] 105.1593
# Clean-up
rm(x,y, gt_nfe.mx, gq_nfe.mx, dp_nfe.mx, gt_wecare.mx, gq_wecare.mx, dp_wecare.mx)

filter_out_high_dp

put NA to genotypes where dp > 1000 : removes <<1% of non-NA genotypes

# num of genotypes to be removed (in !NA gt after previous filtering)
format(sum(dp.mx[!is.na(gq.mx)] > max.dp, na.rm=TRUE), big.mark=",") # ~73K
## [1] "72,630"
# Fraction of genotypes to be removed (in !NA gt after previous filtering)
sum(dp.mx[!is.na(gq.mx)] > max.dp, na.rm=TRUE)/sum(!is.na(gt.mx)) # ~1%
## [1] 0.01498803
# Apply filter (to gt only !)
NA -> gt.mx[ dp.mx > max.dp ]
NA -> gt_chr.mx[ dp.mx > max.dp ]

# Clean up
rm(max.dp)

explore_data_after_gq_dp_filtering

# Prepare matrices with sub-groups data
gt_nfe.mx <- gt.mx[,542:739]
gq_nfe.mx <- gq.mx[,542:739]
dp_nfe.mx <- dp.mx[,542:739]
gt_wecare.mx <- gt.mx[,1:541]
gq_wecare.mx <- gq.mx[,1:541]
dp_wecare.mx <- dp.mx[,1:541]

# --- Fractions of NA genotypes after gq-dp filtering --- #

sum(is.na(gt.mx))/(nrow(gt.mx)*ncol(gt.mx)) # ~22%
## [1] 0.2194281
sum(is.na(gt_nfe.mx))/(nrow(gt_nfe.mx)*ncol(gt_nfe.mx)) # ~25%
## [1] 0.2469261
sum(is.na(gt_wecare.mx))/(nrow(gt_wecare.mx)*ncol(gt_wecare.mx)) # ~20%
## [1] 0.2093641
# --- Call rates per variant after gq-dp filtering --- #

# All samples
x <- ncol(gt.mx)
y <- apply(gt.mx,1,function(z){1-sum(is.na(z))/x})
hist(y, breaks=50, main="Call rates per variant after gq-dp filtering\n739 wecare-nfe samples", xlab="Call rates")

# nfe
x <- ncol(gt_nfe.mx)
y <- apply(gt_nfe.mx,1,function(z){1-sum(is.na(z))/x})
hist(y, breaks=50, main="Call rates per variant after gq-dp filtering\n198 nfe samples", xlab="Call rates")

# wecare
x <- ncol(gt_wecare.mx)
y <- apply(gt_wecare.mx,1,function(z){1-sum(is.na(z))/x})
hist(y, breaks=50, main="Call rates per variant after gq-dp filtering\n541 wecare samples", xlab="Call rates")

# --- gq  after gq-dp filtering (when gt is not NA !) --- #

# All samples
hist(gq.mx[!is.na(gt.mx)], breaks=50, xlim=c(0,100), 
     main="Histogram of gq in non-NA genotypes after gq-dp filtering\n739 wecare-nfe samples", xlab="gq")

hist(gq_nfe.mx[!is.na(gt_nfe.mx)], breaks=50, xlim=c(0,100), 
     main="Histogram of gq in non-NA genotypes after gq-dp filtering\n198 nfe samples", xlab="gq")

hist(gq_wecare.mx[!is.na(gt_wecare.mx)], breaks=50, xlim=c(0,100), 
     main="Histogram of gq in non-NA genotypes after gq-dp filtering\n541 wecare samples", xlab="gq")

# --- dp after gq-dp filtering (when gt is not NA !) --- #

# All samples
hist(dp.mx[!is.na(gt.mx)], breaks=50, 
     main="Histogram of dp in non-NA genotypes after gq-dp filtering\n739 wecare-nfe samples", xlab="dp")

hist(dp.mx[!is.na(gt.mx)], breaks=250, xlim=c(0,100), 
     main="Histogram of dp in non-NA genotypes after gq-dp filtering (zoom, 0:100)\n739 wecare-nfe samples", xlab="dp")

# nfe
hist(dp_nfe.mx[!is.na(gt_nfe.mx)], breaks=50, 
     main="Histogram of dp in non-NA genotypes after gq-dp filtering\n198 nfe samples", xlab="dp")

hist(dp_nfe.mx[!is.na(gt_nfe.mx)], breaks=250, xlim=c(0,100), 
     main="Histogram of dp in non-NA genotypes after gq-dp filtering (zoom, 0:100)\n198 nfe samples", xlab="dp")

# wecare
hist(dp_wecare.mx[!is.na(gt_wecare.mx)], breaks=50, 
     main="Histogram of dp in non-NA genotypes after gq-dp filtering\n541 wecare samples", xlab="dp")

hist(dp_wecare.mx[!is.na(gt_wecare.mx)], breaks=250, xlim=c(0,100), 
     main="Histogram of dp in non-NA genotypes after gq-dp filtering (zoom, 0:100)\n541 wecare samples", xlab="dp")

# Mean GQ and DP in non-NA genotypes after gq-dp filtering
mean(gq.mx[!is.na(gt.mx)], na.rm=TRUE) # ~ 84
## [1] 84.28232
mean(dp.mx[!is.na(gt.mx)], na.rm=TRUE) # ~ 73
## [1] 73.08944
# Clean-up
rm(x,y, gt_nfe.mx, gq_nfe.mx, dp_nfe.mx, gt_wecare.mx, gq_wecare.mx, dp_wecare.mx)

Filtering by fraction of reads supporting ALt allele

Check Alt allele fraction in Hom-Ref, Het and Hom-Alt

# Prepare alt_fraction.mx
alt_fraction.mx <- alt.mx / (alt.mx + ref.mx)
dim(alt_fraction.mx)
## [1] 8275  739
sum(is.na(alt_fraction.mx))
## [1] 336781
sum(is.na(gt.mx))
## [1] 1341852
# Only consider alt_fraction, where genotypes are not NA
NA -> alt_fraction.mx[is.na(gt.mx)]

sum(is.na(alt_fraction.mx))
## [1] 1348217
sum(is.na(gt.mx))
## [1] 1341852
# Explore alt_fraction in hom.ref

alt_fraction_hom_ref.mx <- alt_fraction.mx
NA -> alt_fraction_hom_ref.mx[gt.mx != 0]
sum(!is.na(alt_fraction_hom_ref.mx))
## [1] 4530694
hist(alt_fraction_hom_ref.mx, xlim=c(0,1))

sum(alt_fraction_hom_ref.mx > 0.2, na.rm = T)
## [1] 81
# Explore alt_fraction in het

alt_fraction_het.mx <- alt_fraction.mx
NA -> alt_fraction_het.mx[gt.mx != 1]
sum(!is.na(alt_fraction_het.mx))
## [1] 154851
hist(alt_fraction_het.mx, xlim=c(0,1))

sum(alt_fraction_het.mx < 0.2, na.rm = T)
## [1] 5988
sum(alt_fraction_het.mx > 0.8, na.rm = T)
## [1] 1341
# Explore alt_fraction in hom.alt

alt_fraction_hom_alt.mx <- alt_fraction.mx
NA -> alt_fraction_hom_alt.mx[gt.mx != 2]
sum(!is.na(alt_fraction_hom_alt.mx))
## [1] 81463
hist(alt_fraction_hom_alt.mx, xlim=c(0,1))

sum(alt_fraction_hom_alt.mx < 0.2, na.rm = T)
## [1] 31
# Clean-up (keep matrices needed in teh next chunk!)
rm(alt_fraction.mx)

Update genotypes matrix

NA -> gt.mx[alt_fraction_hom_ref.mx > 0.2]
NA -> gt_chr.mx[alt_fraction_hom_ref.mx > 0.2]

NA -> gt.mx[alt_fraction_het.mx > 0.8]
NA -> gt_chr.mx[alt_fraction_het.mx > 0.8]

NA -> gt.mx[alt_fraction_het.mx < 0.2]
NA -> gt_chr.mx[alt_fraction_het.mx < 0.2]

NA -> gt.mx[alt_fraction_hom_alt.mx < 0.8]
NA -> gt_chr.mx[alt_fraction_hom_alt.mx < 0.8]

rm(alt_fraction_hom_ref.mx, alt_fraction_het.mx, alt_fraction_hom_alt.mx)

Repeat assessment of allelic fractions in Hom-Ref, Het and Hom-Alt

# Prepare alt_fraction.mx
alt_fraction.mx <- alt.mx / (alt.mx + ref.mx)
dim(alt_fraction.mx)
## [1] 8275  739
sum(is.na(alt_fraction.mx))
## [1] 336781
sum(is.na(gt.mx))
## [1] 1349295
# Only consider alt_fraction, where genotypes are not NA
NA -> alt_fraction.mx[is.na(gt.mx)]

sum(is.na(alt_fraction.mx))
## [1] 1355660
sum(is.na(gt.mx))
## [1] 1349295
# Explore alt_fraction in hom.ref

alt_fraction_hom_ref.mx <- alt_fraction.mx
NA -> alt_fraction_hom_ref.mx[gt.mx != 0]
sum(!is.na(alt_fraction_hom_ref.mx))
## [1] 4530613
hist(alt_fraction_hom_ref.mx, xlim=c(0,1))

sum(alt_fraction_hom_ref.mx > 0.2, na.rm = T)
## [1] 0
# Explore alt_fraction in het

alt_fraction_het.mx <- alt_fraction.mx
NA -> alt_fraction_het.mx[gt.mx != 1]
sum(!is.na(alt_fraction_het.mx))
## [1] 147522
hist(alt_fraction_het.mx, xlim=c(0,1))

sum(alt_fraction_het.mx < 0.2, na.rm = T)
## [1] 0
sum(alt_fraction_het.mx > 0.8, na.rm = T)
## [1] 0
# Explore alt_fraction in hom.alt

alt_fraction_hom_alt.mx <- alt_fraction.mx
NA -> alt_fraction_hom_alt.mx[gt.mx != 2]
sum(!is.na(alt_fraction_hom_alt.mx))
## [1] 81430
hist(alt_fraction_hom_alt.mx, xlim=c(0,1))

sum(alt_fraction_hom_alt.mx < 0.2, na.rm = T)
## [1] 0
# Clean-up
rm(alt_fraction.mx, alt_fraction_hom_ref.mx, alt_fraction_het.mx, 
   alt_fraction_hom_alt.mx, ref.mx, alt.mx)

explore_data_after_gq_dp_af_filtering

# Prepare matrices with sub-groups data
gt_nfe.mx <- gt.mx[,542:739]
gq_nfe.mx <- gq.mx[,542:739]
dp_nfe.mx <- dp.mx[,542:739]
gt_wecare.mx <- gt.mx[,1:541]
gq_wecare.mx <- gq.mx[,1:541]
dp_wecare.mx <- dp.mx[,1:541]

# --- Fractions of NA genotypes after gq-dp-af filtering --- #

sum(is.na(gt.mx))/(nrow(gt.mx)*ncol(gt.mx)) # ~22%
## [1] 0.2206452
sum(is.na(gt_nfe.mx))/(nrow(gt_nfe.mx)*ncol(gt_nfe.mx)) # ~25%
## [1] 0.2472721
sum(is.na(gt_wecare.mx))/(nrow(gt_wecare.mx)*ncol(gt_wecare.mx)) # ~21%
## [1] 0.2109
# --- Call rates per variant after gq-dp-af filtering --- #

# All samples
x <- ncol(gt.mx)
y <- apply(gt.mx,1,function(z){1-sum(is.na(z))/x})
hist(y, breaks=50, main="Call rates per variant after gq-dp-af filtering\n739 wecare-nfe samples", xlab="Call rates")

# nfe
x <- ncol(gt_nfe.mx)
y <- apply(gt_nfe.mx,1,function(z){1-sum(is.na(z))/x})
hist(y, breaks=50, main="Call rates per variant after gq-dp-af filtering\n198 nfe samples", xlab="Call rates")

# wecare
x <- ncol(gt_wecare.mx)
y <- apply(gt_wecare.mx,1,function(z){1-sum(is.na(z))/x})
hist(y, breaks=50, main="Call rates per variant after gq-dp-af filtering\n541 wecare samples", xlab="Call rates")

# --- gq  after gq-dp-af filtering (when gt is not NA !) --- #

# All samples
hist(gq.mx[!is.na(gt.mx)], breaks=50, xlim=c(0,100), 
     main="Histogram of gq in non-NA genotypes after gq-dp-af filtering\n739 wecare-nfe samples", xlab="gq")

hist(gq_nfe.mx[!is.na(gt_nfe.mx)], breaks=50, xlim=c(0,100), 
     main="Histogram of gq in non-NA genotypes after gq-dp-af filtering\n198 nfe samples", xlab="gq")

hist(gq_wecare.mx[!is.na(gt_wecare.mx)], breaks=50, xlim=c(0,100), 
     main="Histogram of gq in non-NA genotypes after gq-dp-af filtering\n541 wecare samples", xlab="gq")

# --- dp after gq-dp-af filtering (when gt is not NA !) --- #

# All samples
hist(dp.mx[!is.na(gt.mx)], breaks=50, 
     main="Histogram of dp in non-NA genotypes after gq-dp-af filtering\n739 wecare-nfe samples", xlab="dp")

hist(dp.mx[!is.na(gt.mx)], breaks=250, xlim=c(0,100), 
     main="Histogram of dp in non-NA genotypes after gq-dp-af filtering (zoom, 0:100)\n739 wecare-nfe samples", xlab="dp")

# nfe
hist(dp_nfe.mx[!is.na(gt_nfe.mx)], breaks=50, 
     main="Histogram of dp in non-NA genotypes after gq-dp-af filtering\n198 nfe samples", xlab="dp")

hist(dp_nfe.mx[!is.na(gt_nfe.mx)], breaks=250, xlim=c(0,100), 
     main="Histogram of dp in non-NA genotypes after gq-dp-af filtering (zoom, 0:100)\n198 nfe samples", xlab="dp")

# wecare
hist(dp_wecare.mx[!is.na(gt_wecare.mx)], breaks=50, 
     main="Histogram of dp in non-NA genotypes after gq-dp-af filtering\n541 wecare samples", xlab="dp")

hist(dp_wecare.mx[!is.na(gt_wecare.mx)], breaks=250, xlim=c(0,100), 
     main="Histogram of dp in non-NA genotypes after gq-dp-af filtering (zoom, 0:100)\n541 wecare samples", xlab="dp")

# Mean GQ and DP in non-NA genotypes after gq-dp-af filtering
mean(gq.mx[!is.na(gt.mx)], na.rm=TRUE) # ~ 84
## [1] 84.31422
mean(dp.mx[!is.na(gt.mx)], na.rm=TRUE) # ~ 73
## [1] 73.09395
# Clean-up
rm(x,y, gt_nfe.mx, gq_nfe.mx, dp_nfe.mx, gt_wecare.mx, gq_wecare.mx, dp_wecare.mx)

filter_variants_by_final_call_rate

Remove variants with call rate < 85% in either nfe or wecare after the genotypes filtering:
removes 6,065 (~73%) variants (8,275 -> 2,210)
Most of the variants were removed because of low call rate in NFE

dim(gt.mx)
## [1] 8275  739
# Estimate callrates in WECARE samples
wecare_cases <- c(rep(T,541),rep(F,198))
length(wecare_cases)
## [1] 739
sum(wecare_cases)
## [1] 541
x_wecare <- ncol(gt.mx[,wecare_cases])
y_wecare <- apply(gt.mx[,wecare_cases], 1, function(z){1-sum(is.na(z))/x_wecare})
length(y_wecare)
## [1] 8275
y_wecare[1:7]
## Var000000001 Var000000002 Var000000003 Var000000004 Var000000005 Var000000006 Var000000010 
##    0.9889094    0.9889094    0.9426987    0.7338262    0.5730129    0.7707948    0.9020333
var_wecare.retained <- y_wecare >= min.call.rate
sum(var_wecare.retained) # 4,916 to be retained
## [1] 4916
1 - sum(var_wecare.retained)/nrow(gt.mx[,wecare_cases]) # ~40% to be removed
## [1] 0.4059215
# Estimate callrates in NFE samples
nfe_cases <- !wecare_cases
sum(nfe_cases)
## [1] 198
x_nfe <- ncol(gt.mx[,nfe_cases])
y_nfe <- apply(gt.mx[,nfe_cases], 1, function(z){1-sum(is.na(z))/x_nfe})
length(y_nfe)
## [1] 8275
y_nfe[1:7]
## Var000000001 Var000000002 Var000000003 Var000000004 Var000000005 Var000000006 Var000000010 
##    0.3636364    0.8282828    0.8585859    0.9090909    0.9090909    0.9141414    0.5151515
var_nfe.retained <- y_nfe >= min.call.rate
sum(var_nfe.retained) # 3,565 to be retained
## [1] 3565
1 - sum(var_nfe.retained)/nrow(gt.mx[,nfe_cases]) # ~57% to be removed
## [1] 0.5691843
# Estimate the proportion of variants to be removed
var.retained <- var_wecare.retained & var_nfe.retained
sum(var.retained) # 2,210 to be retained
## [1] 2210
1 - sum(var.retained)/nrow(gt.mx) # ~73% to be removed
## [1] 0.7329305
# Remove variants with loaw call rates
gt.mx <- gt.mx[ var.retained, ]
gt_chr.mx <- gt_chr.mx[ var.retained, ]

dp.mx <- dp.mx[ var.retained, ]
gq.mx <- gq.mx[ var.retained, ]
fixed.df <- fixed.df[ var.retained, ]

# Clean-up
rm(min.call.rate, var.retained, x_wecare, y_wecare, x_nfe, y_nfe, 
   wecare_cases, nfe_cases, var_wecare.retained, var_nfe.retained)

remove_variants_with_the_uniform_genotypes_accross_all_samples

Remove 324 variants: 2,210 -> 1,886
In some variants the diverse genotypes were removed during the above filtering

# Check that there is no all-NA variants
non_NA_count.udf <- function(x){sum(!is.na(x))}
all_NA <- apply(gt.mx, 1, non_NA_count.udf) == 0
sum(all_NA) # 0
## [1] 0
# Function to detect uniform numeric vector
uniform_vector.udf <- function(x){
  if(min(x, na.rm=TRUE) == max(x, na.rm=TRUE)){return(TRUE)} else {return(FALSE)}}

# Variants with uniform genotypes accross all samples 
uniform_genotypes <- apply(gt.mx, 1, uniform_vector.udf)
summary(uniform_genotypes)
##    Mode   FALSE    TRUE 
## logical    1886     324
sum(uniform_genotypes) # 324
## [1] 324
# Remove variants with uniform genotypes accross all samples
gt.mx <- gt.mx[!uniform_genotypes,]
gt_chr.mx <- gt_chr.mx[!uniform_genotypes,]
gq.mx <- gq.mx[!uniform_genotypes,]
dp.mx <- dp.mx[!uniform_genotypes,]

fixed.df <- fixed.df[!uniform_genotypes,]

dim(gt.mx)
## [1] 1886  739
dim(gt_chr.mx)
## [1] 1886  739
dim(gq.mx)
## [1] 1886  739
dim(dp.mx)
## [1] 1886  739
dim(fixed.df)
## [1] 1886  110
# Clean-up
rm(non_NA_count.udf, all_NA, uniform_vector.udf, uniform_genotypes)

explore_data_after_filtering

Genotypes NA rates
Histogram of call rates per variant
Histograms of gq and dp in non-NA genotypes

# Prepare matrices with sub-groups data
gt_nfe.mx <- gt.mx[,542:739]
gq_nfe.mx <- gq.mx[,542:739]
dp_nfe.mx <- dp.mx[,542:739]
gt_wecare.mx <- gt.mx[,1:541]
gq_wecare.mx <- gq.mx[,1:541]
dp_wecare.mx <- dp.mx[,1:541]

# --- Fractions of NA genotypes after filtering --- #

sum(is.na(gt.mx))/(nrow(gt.mx)*ncol(gt.mx)) # ~4.1%
## [1] 0.04149369
sum(is.na(gt_nfe.mx))/(nrow(gt_nfe.mx)*ncol(gt_nfe.mx)) # ~6.2%
## [1] 0.06234937
sum(is.na(gt_wecare.mx))/(nrow(gt_wecare.mx)*ncol(gt_wecare.mx)) # ~3.4%
## [1] 0.03386075
# --- Call rates per variant after filtering --- #

# All samples
x <- ncol(gt.mx)
y <- apply(gt.mx,1,function(z){1-sum(is.na(z))/x})
hist(y, main="Call rates per variant after filtering\n739 wecare-nfe samples", 
     xlim=c(0,1), xlab="Call rates")

# nfe
x <- ncol(gt_nfe.mx)
y <- apply(gt_nfe.mx,1,function(z){1-sum(is.na(z))/x})
hist(y, main="Call rates per variant after genotypes filtering\n198 nfe samples", 
     xlim=c(0,1), xlab="Call rates")

# wecare
x <- ncol(gt_wecare.mx)
y <- apply(gt_wecare.mx,1,function(z){1-sum(is.na(z))/x})
hist(y, main="Call rates per variant after genotypes filtering\n541 wecare samples",
     xlim=c(0,1), xlab="Call rates")

# --- gq  after filtering (when gt is not NA !) --- #

# All samples
hist(gq.mx[!is.na(gt.mx)], breaks=50, xlim=c(0,100),
     main="Histogram of gq in non-NA genotypes after filtering\n739 wecare-nfe samples", xlab="gq")

hist(gq_nfe.mx[!is.na(gt_nfe.mx)], breaks=50, xlim=c(0,100),
     main="Histogram of gq in non-NA genotypes after filtering\n198 nfe samples", xlab="gq")

hist(gq_wecare.mx[!is.na(gt_wecare.mx)], breaks=50, xlim=c(0,100),
     main="Histogram of gq in non-NA genotypes after filtering\n541 wecare samples", xlab="gq")

# --- dp after filtering (when gt is not NA !) --- #

# All samples
hist(dp.mx[!is.na(gt.mx)], breaks=50, 
     main="Histogram of dp in non-NA genotypes after filtering\n739 wecare-nfe samples", xlab="dp")

hist(dp.mx[!is.na(gt.mx)], breaks=250, xlim=c(0,100), 
     main="Histogram of dp in non-NA genotypes after filtering (zoom 0:100)\n739 wecare-nfe samples", xlab="dp")

# nfe
hist(dp_nfe.mx[!is.na(gt_nfe.mx)], breaks=50, 
     main="Histogram of dp in non-NA genotypes after filtering\n198 nfe samples", xlab="dp")

hist(dp_nfe.mx[!is.na(gt_nfe.mx)], breaks=250, xlim=c(0,100), 
     main="Histogram of dp in non-NA genotypes after filtering (zoom 0:100)\n198 nfe samples", xlab="dp")

# wecare
hist(dp_wecare.mx[!is.na(gt_wecare.mx)], breaks=50, 
     main="Histogram of dp in non-NA genotypes after filtering\n541 wecare samples", xlab="dp")

hist(dp_wecare.mx[!is.na(gt_wecare.mx)], breaks=250, xlim=c(0,100), 
     main="Histogram of dp in non-NA genotypes after filtering (zoom 0:100)\n541 wecare samples", xlab="dp")

# Mean GQ and DP in non-NA genotypes after filtering
mean(gq.mx[!is.na(gt.mx)], na.rm=TRUE) # ~ 92
## [1] 92.38478
mean(dp.mx[!is.na(gt.mx)], na.rm=TRUE) # ~ 76
## [1] 75.92664
quantile(gq.mx[!is.na(gt.mx)], na.rm=TRUE)
##   0%  25%  50%  75% 100% 
##   20   99   99   99   99
quantile(dp.mx[!is.na(gt.mx)], na.rm=TRUE)
##   0%  25%  50%  75% 100% 
##   10   34   42   70 1000
# Clean-up (keep dp for later comparison of samples)
rm(x,y, gt_nfe.mx, gq_nfe.mx, dp_nfe.mx, gt_wecare.mx, gq_wecare.mx, dp_wecare.mx, gq.mx)

data_summary

dim(gt.mx)
## [1] 1886  739
class(gt.mx)
## [1] "matrix"
gt.mx[1:5,1:5]
##              100_S8_L007 101_S528_L008 102_S466_L008 103_S147_L007 104_S325_L008
## Var000000003           0             0             0             0             0
## Var000000048           1             2             2             2             2
## Var000000073           0             0             0             0             0
## Var000000095           0             0             0             0             0
## Var000000096           0             0             0             0             0
dim(gt_chr.mx)
## [1] 1886  739
class(gt_chr.mx)
## [1] "matrix"
gt_chr.mx[1:5,1:5]
##              100_S8_L007 101_S528_L008 102_S466_L008 103_S147_L007 104_S325_L008
## Var000000003 "C/C"       "C/C"         "C/C"         "C/C"         "C/C"        
## Var000000048 "T/C"       "C/C"         "C/C"         "C/C"         "C/C"        
## Var000000073 "C/C"       "C/C"         "C/C"         "C/C"         "C/C"        
## Var000000095 "T/T"       "T/T"         "T/T"         "T/T"         "T/T"        
## Var000000096 "T/T"       "T/T"         "T/T"         "T/T"         "T/T"
dim(dp.mx)
## [1] 1886  739
class(dp.mx)
## [1] "matrix"
dp.mx[1:5,1:5]
##              100_S8_L007 101_S528_L008 102_S466_L008 103_S147_L007 104_S325_L008
## Var000000003         304           166           267           146           411
## Var000000048         316           192           145           124           246
## Var000000073         159            46           186            42            62
## Var000000095          58            75           278            63           344
## Var000000096          58            75           278            63           344
dim(fixed.df)
## [1] 1886  110
str(fixed.df)
## 'data.frame':    1886 obs. of  110 variables:
##  $ CHROM                  : chr  "1" "1" "1" "1" ...
##  $ POS                    : int  3669172 3677933 3680345 3683914 3683957 3683982 3683997 11735245 11736131 11737010 ...
##  $ ID                     : chr  "rs41315312" "rs1181883" "rs144028564" "rs2275839" ...
##  $ REF                    : chr  "C" "T" "C" "T" ...
##  $ ALT                    : chr  "T" "C" "T" "A" ...
##  $ QUAL                   : num  1167626 908688 46177 70073 198 ...
##  $ FILTER                 : chr  "PASS" "PASS" "PASS" "PASS" ...
##  $ init_AC                : chr  "116" "601" "8" "9" ...
##  $ init_AF                : chr  "0.080" "0.417" "5.634e-03" "6.259e-03" ...
##  $ init_AN                : int  1442 1442 1420 1438 1440 1440 1444 1436 1422 1454 ...
##  $ AS_FilterStatus        : chr  "PASS" "PASS" "PASS" "PASS" ...
##  $ AS_VQSLOD              : chr  "6.2410" "11.6456" "8.9794" "9.6262" ...
##  $ AS_culprit             : chr  "SOR" "FS" "FS" "FS" ...
##  $ BaseQRankSum           : num  0.742 -1.622 0 0.387 -0.12 ...
##  $ ClippingRankSum        : num  -0.108 -0.021 0.12 -0.389 0.168 0.452 0.095 0.087 0.24 0.715 ...
##  $ DB                     : logi  TRUE TRUE TRUE TRUE TRUE TRUE ...
##  $ DP                     : int  203695 74466 64575 119262 118118 110518 110860 244440 191835 127339 ...
##  $ DS                     : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ END                    : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ ExcessHet              : num  1.2 3.64 3.1 3.12 3.01 ...
##  $ FS                     : num  1.7 0 0 0 0 ...
##  $ HaplotypeScore         : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ InbreedingCoeff        : num  0.0204 -0.0089 -0.0057 -0.0067 0.2668 ...
##  $ MLEAC                  : chr  "116" "600" "8" "9" ...
##  $ MLEAF                  : chr  "0.080" "0.416" "5.634e-03" "6.259e-03" ...
##  $ MQ                     : num  60 60 60 60 60 60 60 60 60 60 ...
##  $ MQRankSum              : num  -0.051 0.005 -0.465 0.331 -1.271 ...
##  $ NDA                    : int  1 3 1 1 1 1 2 3 1 1 ...
##  $ NEGATIVE_TRAIN_SITE    : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ POSITIVE_TRAIN_SITE    : logi  TRUE TRUE TRUE TRUE TRUE FALSE ...
##  $ QD                     : num  16.45 14.48 11.19 16.22 7.09 ...
##  $ ReadPosRankSum         : num  0.621 0.397 0.376 0.367 0.216 3.11 0.664 0.849 -0.495 0.693 ...
##  $ SOR                    : num  1.173 0.667 0.702 0.252 0.77 ...
##  $ SplitVarID             : chr  "Var000000003" "Var000000048" "Var000000073" "Var000000095" ...
##  $ VQSLOD                 : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ culprit                : chr  NA NA NA NA ...
##  $ exac_non_TCGA.AC       : chr  "7859" "44134" "684" "2082" ...
##  $ exac_non_TCGA.AC_AFR   : chr  "496" "5759" "12" "11" ...
##  $ exac_non_TCGA.AC_AMR   : chr  "428" "5474" "64" "692" ...
##  $ exac_non_TCGA.AC_Adj   : chr  "7856" "44112" "684" "2075" ...
##  $ exac_non_TCGA.AC_EAS   : chr  "428" "2227" "0" "940" ...
##  $ exac_non_TCGA.AC_FEMALE: chr  "2983" "19316" "306" "1098" ...
##  $ exac_non_TCGA.AC_FIN   : chr  "563" "2545" "72" "105" ...
##  $ exac_non_TCGA.AC_Hemi  : chr  NA NA NA NA ...
##  $ exac_non_TCGA.AC_Het   : chr  "7192" "24906" "672" "1901" ...
##  $ exac_non_TCGA.AC_Hom   : chr  "332" "9603" "6" "87" ...
##  $ exac_non_TCGA.AC_MALE  : chr  "4873" "24796" "378" "977" ...
##  $ exac_non_TCGA.AC_NFE   : chr  "3675" "20768" "520" "93" ...
##  $ exac_non_TCGA.AC_OTH   : chr  "56" "280" "4" "11" ...
##  $ exac_non_TCGA.AC_SAS   : chr  "2210" "7059" "12" "223" ...
##  $ exac_non_TCGA.AF       : chr  "0.074" "0.416" "6.440e-03" "0.020" ...
##  $ exac_non_TCGA.AN       : int  106210 106210 106210 106210 106210 106210 106200 NA 106210 106206 ...
##  $ exac_non_TCGA.AN_AFR   : int  9058 9060 9060 9014 8922 8536 7914 NA 8902 8896 ...
##  $ exac_non_TCGA.AN_AMR   : int  11216 11194 11194 11148 11104 10784 9992 NA 11070 11114 ...
##  $ exac_non_TCGA.AN_Adj   : int  106196 106120 106140 105806 105114 101574 94348 NA 105034 105596 ...
##  $ exac_non_TCGA.AN_EAS   : int  7862 7842 7852 7842 7848 7722 7254 NA 7834 7852 ...
##  $ exac_non_TCGA.AN_FEMALE: chr  "45870" "45836" "45832" "45656" ...
##  $ exac_non_TCGA.AN_FIN   : int  6614 6606 6612 6572 6414 5988 5426 NA 6450 6594 ...
##  $ exac_non_TCGA.AN_MALE  : chr  "60326" "60284" "60308" "60150" ...
##  $ exac_non_TCGA.AN_NFE   : int  54344 54316 54330 54182 53942 52516 49330 NA 53998 54144 ...
##  $ exac_non_TCGA.AN_OTH   : int  694 694 694 686 672 642 590 NA 676 690 ...
##  $ exac_non_TCGA.AN_SAS   : int  16408 16408 16398 16362 16212 15386 13842 NA 16104 16306 ...
##  $ exac_non_TCGA.Hemi_AFR : chr  NA NA NA NA ...
##  $ exac_non_TCGA.Hemi_AMR : chr  NA NA NA NA ...
##  $ exac_non_TCGA.Hemi_EAS : chr  NA NA NA NA ...
##  $ exac_non_TCGA.Hemi_FIN : chr  NA NA NA NA ...
##  $ exac_non_TCGA.Hemi_NFE : chr  NA NA NA NA ...
##  $ exac_non_TCGA.Hemi_OTH : chr  NA NA NA NA ...
##  $ exac_non_TCGA.Hemi_SAS : chr  NA NA NA NA ...
##  $ exac_non_TCGA.Het_AFR  : chr  "476" "2111" "12" "11" ...
##  $ exac_non_TCGA.Het_AMR  : chr  "412" "2746" "64" "652" ...
##  $ exac_non_TCGA.Het_EAS  : chr  "412" "1565" "0" "814" ...
##  $ exac_non_TCGA.Het_FIN  : chr  "517" "1565" "72" "105" ...
##  $ exac_non_TCGA.Het_NFE  : chr  "3401" "12746" "510" "89" ...
##  $ exac_non_TCGA.Het_OTH  : chr  "48" "166" "4" "11" ...
##  $ exac_non_TCGA.Het_SAS  : chr  "1926" "4007" "10" "219" ...
##  $ exac_non_TCGA.Hom_AFR  : chr  "10" "1824" "0" "0" ...
##  $ exac_non_TCGA.Hom_AMR  : chr  "8" "1364" "0" "20" ...
##  $ exac_non_TCGA.Hom_EAS  : chr  "8" "331" "0" "63" ...
##  $ exac_non_TCGA.Hom_FIN  : chr  "23" "490" "0" "0" ...
##  $ exac_non_TCGA.Hom_NFE  : chr  "137" "4011" "5" "2" ...
##  $ exac_non_TCGA.Hom_OTH  : chr  "4" "57" "0" "0" ...
##  $ exac_non_TCGA.Hom_SAS  : chr  "142" "1526" "1" "2" ...
##  $ kgen.AC                : chr  "390" "2384" "17" "137" ...
##  $ kgen.AF                : chr  "0.0778754" "0.476038" "0.00339457" "0.0273562" ...
##  $ kgen.AFR_AF            : chr  "0.0461" "0.6944" "0.0015" "0.003" ...
##  $ kgen.AMR_AF            : chr  "0.0418" "0.4683" "0.0086" "0.0346" ...
##  $ kgen.AN                : int  5008 5008 5008 5008 5008 NA 5008 5008 5008 NA ...
##  $ kgen.EAS_AF            : chr  "0.0645" "0.2788" "0" "0.0883" ...
##  $ kgen.EUR_AF            : chr  "0.0875" "0.4354" "0.0089" "0.003" ...
##  $ kgen.SAS_AF            : chr  "0.1503" "0.4315" "0" "0.0174" ...
##  $ SYMBOL                 : chr  "CCDC27" "CCDC27" "CCDC27" "CCDC27" ...
##  $ Allele                 : chr  "T" "C" "T" "A" ...
##  $ Existing_variation     : chr  "rs41315312" "rs1181883" "rs144028564" "rs2275839" ...
##  $ Consequence            : chr  "missense_variant" "missense_variant" "missense_variant" "missense_variant" ...
##  $ IMPACT                 : chr  "MODERATE" "MODERATE" "MODERATE" "MODERATE" ...
##  $ CLIN_SIG               : chr  NA NA NA NA ...
##  $ cDNA_position          : chr  "211" "884" "1481" "1732" ...
##  $ CDS_position           : chr  "127" "800" "1397" "1648" ...
##   [list output truncated]
fixed.df[1:5,1:5]
##              CHROM     POS          ID REF ALT
## Var000000003     1 3669172  rs41315312   C   T
## Var000000048     1 3677933   rs1181883   T   C
## Var000000073     1 3680345 rs144028564   C   T
## Var000000095     1 3683914   rs2275839   T   A
## Var000000096     1 3683957 rs146757641   T   C
# Check consistence of rownames
sum(rownames(gt.mx) != rownames(gt_chr.mx))
## [1] 0
sum(rownames(gt.mx) != rownames(dp.mx))
## [1] 0
sum(colnames(gt.mx) != colnames(gt_chr.mx))
## [1] 0
sum(colnames(gt.mx) != colnames(dp.mx))
## [1] 0
sum(rownames(gt.mx) != rownames(fixed.df))
## [1] 0

save_data

save.image(paste(base_folder, "r02_filter_genotypes.RData", sep="/"))

final_section

ls()
## [1] "base_folder" "dp.mx"       "fixed.df"    "gt_chr.mx"   "gt.mx"
sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS  10.14.4
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] knitr_1.21
## 
## loaded via a namespace (and not attached):
##  [1] compiler_3.5.1  magrittr_1.5    tools_3.5.1     htmltools_0.3.6 yaml_2.2.0      Rcpp_1.0.0      stringi_1.2.4   rmarkdown_1.11  stringr_1.3.1   xfun_0.4        digest_0.6.18   evaluate_0.12
Sys.time()
## [1] "2019-04-23 15:01:12 BST"